import data

bcdata = read_csv("bcdata_Assignment1.csv") |> janitor::clean_names()

Q1 get quantitative fesures

summary = skimr::skim(bcdata) |> 
  select(skim_variable, numeric.mean, numeric.p50, numeric.p0, numeric.p100) |> knitr::kable(digits = 2) |> print()
## 
## 
## |skim_variable  | numeric.mean| numeric.p50| numeric.p0| numeric.p100|
## |:--------------|------------:|-----------:|----------:|------------:|
## |age            |        57.30|       56.00|      24.00|        89.00|
## |bmi            |        27.58|       27.66|      18.37|        38.58|
## |glucose        |        97.79|       92.00|      60.00|       201.00|
## |insulin        |        10.01|        5.92|       2.43|        58.46|
## |homa           |         2.69|        1.38|       0.47|        25.05|
## |leptin         |        26.62|       20.27|       4.31|        90.28|
## |adiponectin    |        10.18|        8.35|       1.66|        38.04|
## |resistin       |        14.73|       10.83|       3.21|        82.10|
## |mcp_1          |       534.65|      471.32|      45.84|      1698.44|
## |classification |         1.55|        2.00|       1.00|         2.00|
# There's no missing in this dataset.

Q2 catogorize bmi based on WHO criteria

q2 = bcdata |> mutate(who_bmi =
                   ifelse(bmi < 16.5, "Severely underweight",
                          ifelse(16.5 <= bmi & bmi < 18.5, "Underweight",
                                 ifelse(18.5 <= bmi & bmi < 25, "Normal weight",
                                        ifelse(25 <= bmi & bmi < 30, "Overweight", 
                                               ifelse(30 <= bmi & bmi < 35, "Obesity class I",
                                                      ifelse(35 <= bmi & bmi < 40, "Obesity class II","Obesity class III"))))))) |> 
  mutate(classification = recode(classification, "1" = "Healthy Controls", "2" = "Breast Cancer Patients")) |> arrange(bmi)



#check if I have recoded bmi correctly
table(q2$bmi, q2$who_bmi)

Q3 draw bar plot

#Since there's no healthy controls in underweight category, I added a category "underweight healthhy controls" with 0 count to make sure every column has the same width.
freq_table = q2 |> group_by(who_bmi, classification) |>
  summarise(n = n()) |> mutate(proportion = n / sum(n) * 100)

supp = data.frame(who_bmi = "Underweight", 
              classification = "Healthy Controls",
              n = 0, proportion = 0)

final_freq = bind_rows(supp, freq_table) |> mutate(who_bmi = as.factor(who_bmi)) 

level = c("Severely underweight", "Underweight", "Normal weight", "Overweight", "Obesity class I", "Obesity class II", "Obesity class III")

final_freq |> 
  mutate(who_bmi = forcats::fct_relevel(who_bmi, level),
         text_label = str_c(proportion, "%")) |>
  plot_ly(x = ~who_bmi, y = ~proportion, color = ~classification, type = "bar", colors = "viridis", text = ~text_label) |>
  layout(title = 'Barplot showing the proportion of breast cancer cases and controls by WHO BMI category', xaxis = list(title = 'WHO BMI Category'), yaxis = list(title = 'Proportion(%)'), barmode = "stack")

Q4 regression model

q4 = bcdata |> 
  mutate(classification = recode(classification, "1" = 0, "2" = 1))
# recode 1-healthy controls as "0", 2-breast cancer patients as "1"
table(q4$classification, bcdata$classification)
##    
##      1  2
##   0 52  0
##   1  0 64
# recoded correctly

mylogit = glm(classification ~ glucose + homa + leptin + bmi + age, data = q4, family = "binomial")
mylogit |> broom::tidy() |> knitr::kable(digits = 2)
term estimate std.error statistic p.value
(Intercept) -3.63 2.36 -1.54 0.12
glucose 0.08 0.02 3.47 0.00
homa 0.27 0.17 1.59 0.11
leptin -0.01 0.02 -0.54 0.59
bmi -0.10 0.06 -1.84 0.07
age -0.02 0.01 -1.59 0.11
cbind(estimate = coef(mylogit), confint(mylogit)) |> knitr::kable(digits = 2)
estimate 2.5 % 97.5 %
(Intercept) -3.63 -8.54 0.75
glucose 0.08 0.04 0.13
homa 0.27 -0.03 0.65
leptin -0.01 -0.04 0.02
bmi -0.10 -0.22 0.00
age -0.02 -0.05 0.00
exp(cbind(OR = coef(mylogit), confint(mylogit))) |> knitr::kable(digits = 2)
OR 2.5 % 97.5 %
(Intercept) 0.03 0.00 2.13
glucose 1.09 1.04 1.14
homa 1.32 0.97 1.92
leptin 0.99 0.96 1.02
bmi 0.90 0.80 1.00
age 0.98 0.95 1.00

Q5 linear regression

q5 = bcdata
fit = lm(insulin ~ bmi + age + glucose, data = q5)
fit |> broom::tidy() |> knitr::kable(digits = 2)
term estimate std.error statistic p.value
(Intercept) -13.50 5.86 -2.30 0.02
bmi 0.15 0.16 0.91 0.36
age -0.05 0.05 -1.04 0.30
glucose 0.23 0.04 6.13 0.00
cbind(estimate = coef(fit), confint(fit)) |> knitr::kable(digits = 2)
estimate 2.5 % 97.5 %
(Intercept) -13.50 -25.11 -1.89
bmi 0.15 -0.17 0.47
age -0.05 -0.16 0.05
glucose 0.23 0.16 0.30